Extracting SQL code from SSIS dtsx packages with Python lxml


In [ ]:
# imports
import os
from lxml import etree

In [ ]:
# set sql output directory
sql_out = r"C:\temp\dtsxsql"
if not os.path.isdir(sql_out):
    os.makedirs(sql_out)

In [ ]:
# set dtsx package file
ssis_dtsx = r'C:\temp\dtsx\ParseXML.dtsx'
if not os.path.isfile(ssis_dtsx):
    print("no package file")

In [ ]:
# read and parse ssis package
tree = etree.parse(ssis_dtsx)
root = tree.getroot()
root.tag

In [ ]:
# collect unique lxml transformed element tags
ele_tags = set()
for ele in root.xpath(".//*"):
    ele_tags.add(ele.tag)
print(ele_tags)
print(len(ele_tags))

In [ ]:
# scan package tree and extract sql source code
total_bytes = 0
package_name = root.attrib['{www.microsoft.com/SqlServer/Dts}ObjectName'].replace(" ","")
for cnt, ele in enumerate(root.xpath(".//*")):
    if ele.tag == "{www.microsoft.com/SqlServer/Dts}Executable":
        attr = ele.attrib
        for child0 in ele:
            if child0.tag == "{www.microsoft.com/SqlServer/Dts}ObjectData":
                for child1 in child0:
                    sql_comment = attr["{www.microsoft.com/SqlServer/Dts}ObjectName"].strip()
                    if child1.tag == "{www.microsoft.com/sqlserver/dts/tasks/sqltask}SqlTaskData":
                        dtsx_sql = child1.attrib["{www.microsoft.com/sqlserver/dts/tasks/sqltask}SqlStatementSource"]
                        dtsx_sql = "-- " + sql_comment + "\n" + dtsx_sql
                        sql_file = sql_out + "\\" + package_name + str(cnt) + ".sql"
                        total_bytes += len(dtsx_sql)
                        print((len(dtsx_sql), sql_comment, sql_file))
                        with open(sql_file, "w") as file:
                            file.write(dtsx_sql)
print(('total sql bytes',total_bytes))

In [ ]: